library(macpie)
library(tibble)
library(stringr)
library(pheatmap)
library(ggiraph)
library(tidyseurat)
## Loading required package: ttservice
##
## Attaching package: 'ttservice'
## The following objects are masked from 'package:macpie':
##
## bind_cols, bind_rows, plot_ly
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
## ========================================
## tidyseurat version 0.8.0
## If you use TIDYSEURAT in published research, please cite:
##
## Mangiola et al. Interfacing Seurat with the R tidy universe. Bioinformatics 2021.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(tidyseurat))
##
## To restore the Seurat default display use options("restore_Seurat_show" = TRUE)
## ========================================
##
## Attaching package: 'tidyseurat'
## The following object is masked from 'package:ttservice':
##
## plot_ly
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:testthat':
##
## is_null
library(DT)
##
## Attaching package: 'DT'
## The following object is masked from 'package:SeuratObject':
##
## JS
## The following object is masked from 'package:macpie':
##
## JS
library(ggrepel)
library(ggalluvial)
library(ggplot2)
library(ggvenn)
## Loading required package: grid
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:macpie':
##
## plotMA
# batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
# load(paste0(dir,"DRUGseqData/de.RData"))
# batch24_de <- de_res%>%filter(batch_id=="24")
# saveRDS(batch24_de, file = paste0(dir,"DRUGseqData/DE_batch24.Rds"))
batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds"))
head(batch24_de)
## investigation_id analysis_id pipeline_run_key pipeline_group_run_key method
## 1 2384 24 27 27 limma
## 2 2384 24 27 27 limma
## 3 2384 24 27 27 limma
## 4 2384 24 27 27 limma
## 5 2384 24 27 27 limma
## 6 2384 24 27 27 limma
## normalization
## 1 quantile=0.75;TMM;log2(CPM)
## 2 quantile=0.75;TMM;log2(CPM)
## 3 quantile=0.75;TMM;log2(CPM)
## 4 quantile=0.75;TMM;log2(CPM)
## 5 quantile=0.75;TMM;log2(CPM)
## 6 quantile=0.75;TMM;log2(CPM)
## comparison_name_public
## 1 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 2 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 3 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 4 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 5 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 6 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## cmpd_sample_id concentration unit hours_post_treatment batch_id
## 1 UD-20-ER54 10 uM 24 24
## 2 UD-20-ER54 10 uM 24 24
## 3 UD-20-ER54 10 uM 24 24
## 4 UD-20-ER54 10 uM 24 24
## 5 UD-20-ER54 10 uM 24 24
## 6 UD-20-ER54 10 uM 24 24
## gene.ID logFC adj.P.Val P.Value AveExpr t
## 1 MYL6,grch38_12 1.726500 6.467501e-43 4.413170e-47 11.768734 15.518950
## 2 APRT,grch38_16 2.334604 2.164408e-24 2.953815e-28 8.730357 11.508262
## 3 KRT18,grch38_12 1.157740 1.327225e-15 2.716939e-19 9.335040 9.239790
## 4 SCD,grch38_10 1.199936 1.352286e-13 3.690990e-17 7.732632 8.636743
## 5 TUBB6,grch38_18 -1.005931 3.723618e-11 1.270426e-14 8.083550 -7.873065
## 6 TIMM8B,grch38_11 1.109810 3.654986e-10 1.496412e-13 7.957763 7.532087
FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56")
ff86_res_toptable <- FF86_res[,13:18]
ff86_res_toptable <- ff86_res_toptable %>%
separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>%
select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>%
rename(log2FC=logFC, p_value_adj = adj.P.Val)
head(ff86_res_toptable)
## gene log2FC p_value_adj P.Value AveExpr t
## 1 AC122713.2 4.862957 1.397952e-191 9.539077e-196 0.04920401 41.95785
## 2 TRIP6 -3.328079 5.156775e-166 7.037564e-170 9.40624753 -37.18794
## 3 MIR7-3HG 8.513923 2.799813e-151 5.731449e-155 0.20792310 34.52594
## 4 TUBB -2.898698 1.317157e-145 3.595107e-149 11.65399547 -33.50503
## 5 CYP4F2 6.454706 7.730175e-142 2.637385e-145 0.10131583 32.82814
## 6 HDGF -3.828482 2.967781e-140 1.215059e-143 9.05238910 -32.53773
## combined_id
## 1 FF_86_NH56_10
## 2 FF_86_NH56_10
## 3 FF_86_NH56_10
## 4 FF_86_NH56_10
## 5 FF_86_NH56_10
## 6 FF_86_NH56_10
plot_volcano(ff86_res_toptable, max.overlaps =5)
## Warning: ggrepel: 4342 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow()
## [1] 1423
drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull()
#show histogram of p values
ggplot(ff86_res_toptable, aes(x = P.Value)) +
geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Adjusted P-values", x = "Adjusted P-value", y = "Frequency") +
theme_minimal()
# mac_filtered <- filter_genes_by_expression(as_mac,
# group_by = "combined_id", min_counts = 10,
# min_samples = min_sample_num)
# saveRDS(mac_filtered,
# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds"))
mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds"))
mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-")
mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_")
According to DRUGseq metadata:
Wells with water are labelled as EC-27-RY89
Wells with DMSO are labelled as CB-43-EP73
# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10"))
mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0"))
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "raw")+ scale_x_discrete(drop = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "limma_voom")+ scale_x_discrete(drop = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "DESeq2")+ scale_x_discrete(drop = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## converting counts to integer mode
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "edgeR")+ scale_x_discrete(drop = FALSE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
Note: instead of discussing which correction methods we should use for this data, we only show the ways we detected and corrected batch effect here. As batch effect adjustment for sequencing data has been implemented in different methods, such as DESeq2, RUVSeq, edgeR. We highly recommend users thoroughly checking any batch effects and exploring different methods.
In the next part of the vignette, we demonstrate a batch
parameter has implemented in the differential expression test for batch
correction.
##Limma trend
# data <- mac_badDSMOremoved
# treatment_samples <- "FF_86_NH56_10"
# control_samples <- "CB_43_EP73_0"
limma.trend<- function(data = data, treatment_samples, control_samples){
data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)]
batch <- data$orig.ident
if (length(unique(data$combined_id)) < 2) {
stop("Insufficient factors for differential expression analysis.")
}
pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id))
combined_id <- data$combined_id
model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
model.matrix(~0+combined_id + batch)
dge <- DGEList(counts = data@assays$RNA$counts,
samples = pheno_data$condition,
group = pheno_data$condition)
dge <- calcNormFactors(dge, method="TMMwsp")
# keep <- edgeR::filterByExpr(dge, design = model_matrix)
# dge <- dge[keep, , keep.lib.sizes = FALSE]
clean.TMM<-log2(edgeR::cpm(dge, normalized.lib.sizes=T,log=F)+1)
myargs <- list(paste0("combined_id",
treatment_samples, "-",
paste0("combined_id", control_samples)),
levels = model_matrix)
contrasts <- do.call(makeContrasts, myargs)
lmfit <- lmFit(clean.TMM, model_matrix)
lmfit <- contrasts.fit(lmfit, contrasts)
lmfit <- eBayes(lmfit, trend = TRUE)
#Warning: Zero sample variances detected, have been offset away from zero
top_table<- topTable(lmfit, number = nrow(data)) %>%
as.data.frame() %>%
select("logFC", "t", "P.Value", "adj.P.Val") %>%
rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val")%>%
rownames_to_column("gene")
return(top_table)}
# top_table%>%filter(p_value_adj < 0.05 & log2FC>0)
# make a function of alluvial plot with ggalluvial to compare the ranking of DEGs from macpie and DRUGseq
plot_alluvial <- function(macpie_tbl, drugseq_tbl){
df <- macpie_tbl %>%
transmute(gene, p_value_adj_mac = p_value_adj, log2FC_mac = log2FC) %>%
inner_join(
drugseq_tbl %>% transmute(gene, p_value_adj_drg = p_value_adj, log2FC_drg = log2FC),
by = "gene"
)
# Rank within each method by (padj, |log2FC|), tie-safe
rank_by_two <- function(padj, lfc) {
ord <- order(padj, -abs(lfc), na.last = TRUE)
r <- integer(length(padj))
r[ord] <- seq_along(ord)
r
}
df <- df %>%
mutate(
rank_mac = rank_by_two(p_value_adj_mac, log2FC_mac),
rank_drg = rank_by_two(p_value_adj_drg, log2FC_drg)
)
# Define rank bins
cuts <- c(0, 25, 50, 100, 200, 300, 500, Inf)
labs <- c("Top25","26–50","51–100","101–200","201–300", "301-500",">500")
df <- df %>%
mutate(
mac_bin = cut(rank_mac, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE),
drg_bin = cut(rank_drg, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE)
)
# Ranking movement category (color)
bin_index <- function(x) match(x, labs) # lower index = "more top"
df <- df %>%
mutate(
mac_idx = bin_index(mac_bin),
drg_idx = bin_index(drg_bin),
movement = case_when(
is.na(mac_idx) | is.na(drg_idx) ~ NA_character_,
mac_idx < drg_idx ~ "Higher rank in macpie", # moved up into a more-top bin
mac_idx > drg_idx ~ "Lower rank in macpie", # moved down
TRUE ~ "Same rank"
),
movement = factor(movement, levels = c("Higher rank in macpie","Same rank","Lower rank in macpie"))
) %>%
tidyr::drop_na(mac_bin, drg_bin, movement)
# Aggregate flows for alluvial
flows <- df %>%
count(mac_bin, drg_bin, movement, name = "n") %>%
mutate(
mac_bin = factor(mac_bin, levels = labs),
drg_bin = factor(drg_bin, levels = labs)
)
ggplot(flows, aes(axis1 = mac_bin, axis2 = drg_bin, y = n)) +
geom_alluvium(aes(fill = movement), alpha = 0.75, width = 0.14, knot.pos = 0.4) +
geom_stratum(width = 0.14, color = "grey30") +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
scale_x_discrete(limits = c("Macpie rank", "DRUGseq rank"), expand = c(.05, .05)) +
scale_fill_manual(values = c("Higher rank in macpie" = "orange",
"Same rank" = "#7570b3",
"Lower rank in macpie" = "grey60")) +
labs(x = NULL, y = "Gene count",
fill = "Movement vs DRUGseq",
subtitle = "Only DEGs with padj < 0.05 & log2FC>0 (both methods)") +
theme_classic()
}
In here, you can specify a single condition in the combined_id column
and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the
column of orig.ident as the input for batch parameter,
compute_singe_de function can perform differential
expression analysis using the preferred method (limma voom in this
example) with batch information.
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
methods_res <- lapply(methods, function(m){
message("\n","Processing method: ", m,"\n")
# m<-"limma_voom"
treatment_samples <- "FF_86_NH56_10"
control_samples <- "CB_43_EP73_0"
subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples))
batch <- subset$orig.ident
if (m!="limma_trend"){
subset <- filter_genes_by_expression(subset,
group_by = "combined_id", min_counts = 5,
min_samples = 1)
top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch)
} else{
top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
}
# plot(plot_volcano(top_table, max.overlaps = 5))
alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
length(intersect(alldmso_degs, drugseq_deg))
top_table <- top_table %>%
arrange(p_value_adj, desc(log2FC)) %>%
mutate(gene = factor(gene, levels = unique(gene)))
# add a column if there are in the intersect(alldmso_degs, drugseq_deg)
top_table <- top_table %>%
mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no"))
plt_ecdf <- ggplot(top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
stat_ecdf(size = 1) +
scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
color = "In DRUGseq DEGs") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_classic()
ks_test_results <- ks.test(top_table$p_value_adj[top_table$in_drugseq_deg=="yes"],
top_table$p_value_adj[top_table$in_drugseq_deg=="no"], alternative = "greater")
# ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
# geom_point(alpha = 0.6, size = 1.2) +
# geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
# geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
# scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
# labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
# theme_classic()
#
# label a few top overlapping genes
lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |>
dplyr::arrange(p_value_adj, dplyr::desc(log2FC))
volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) +
scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
theme_classic()
ks <- c(25,50,100,200,500,1000)
prec_at_k <- sapply(ks, function(k)
mean(top_table$in_drugseq_deg[1:k] == "yes")
)
# plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
# ylab = "Precision@k (fraction in DRUGseq set)")
ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
geom_point() + geom_line() +
labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
theme_classic()
# alluvial plot
macpie_tbl <- top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
#return
result_list <- list(top_table = top_table,
num_degs_macpie = length(alldmso_degs),
n_overlap = length(intersect(alldmso_degs, drugseq_deg)),
ecdf_plot = plt_ecdf,
ks_test_results = ks_test_results,
volcano_plot = volcano_overlap,
ks_plot = ks_plot,
alluvial_plot = alluvial_plot)
return(result_list)
})
##
## Processing method: limma_voom
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
## p-value will be approximate in the presence of ties
##
## Processing method: DESeq2
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1186 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
## p-value will be approximate in the presence of ties
##
## Processing method: edgeR
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
## p-value will be approximate in the presence of ties
##
## Processing method: limma_trend
## Warning: Zero sample variances detected, have been offset away from zero
## Warning: p-value will be approximate in the presence of ties
names(methods_res) <- methods
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
deg_summary <- map_df(methods_res, function(x) {
data.frame(
num_degs_macpie = x$num_degs_macpie,
n_overlap = x$n_overlap,
num_degs_DRUGseq = length(drugseq_deg)
)
}, .id = paste0("macpie_methods"))
deg_summary
## macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
## 1 limma_voom 4280 778 1423
## 2 DESeq2 1959 779 1423
## 3 edgeR 2321 721 1423
## 4 limma_trend 1574 400 1423
methods_res$limma_voom$volcano_plot
## Warning: ggrepel: 746 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
methods_res$DESeq2$volcano_plot
## Warning: Removed 831 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: ggrepel: 738 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
methods_res$edgeR$volcano_plot
## Warning: ggrepel: 654 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
methods_res$limma_trend$volcano_plot
## Warning: ggrepel: 368 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
methods_res$limma_voom$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
methods_res$DESeq2$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
methods_res$edgeR$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
methods_res$limma_trend$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
#append top_table from each method into a single data frame
all_methods_top_table <- map_df(methods_res, "top_table", .id = "method")
head(all_methods_top_table)
## method gene log2FC metric p_value p_value_adj
## 1 limma_voom AL031963.2 1.483560 27.66417 4.119762e-36 8.820411e-32
## 2 limma_voom Z97192.4 5.026037 26.15530 9.565172e-35 1.023952e-30
## 3 limma_voom SLC45A4 1.720577 20.68042 3.573334e-29 2.550170e-25
## 4 limma_voom AC007066.1 1.844768 18.51969 1.356781e-22 7.262170e-19
## 5 limma_voom CYCSP44 1.696008 14.95493 5.065287e-22 1.807463e-18
## 6 limma_voom AP003068.2 1.679486 17.96392 4.522782e-22 1.807463e-18
## in_drugseq_deg
## 1 no
## 2 yes
## 3 yes
## 4 yes
## 5 no
## 6 no
#if in_drugseq_deg is "no", change method to "DRUGseq"
all_methods_top_table <- all_methods_top_table %>%
mutate(method = ifelse(in_drugseq_deg == "no", "not in DRUGseq", method))
#rename methods
all_methods_top_table$method <- recode(all_methods_top_table$method,
"limma_voom" = "macpie:limma_voom",
"DESeq2" = "macpie:DESeq2",
"edgeR" = "macpie:edgeR",
"limma_trend" = "macpie:limma_trend")
ggplot(all_methods_top_table, aes(x = p_value_adj, color = method)) +
stat_ecdf(size = 1) +
scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
color = "In DRUGseq DEGs") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_classic()
## Warning: Removed 831 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
rm(top_table, alldmso_degs, deg_summary)
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'top_table' not
## found
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'alldmso_degs' not
## found
From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells.
We know these good DMSO wells for batch 24 from their published data:
VH02012942: I23, M23
VH02012944: D23, H23
VH02012956: I23, J23
batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
names(batch24)
## [1] "VH02012956" "VH02012942" "VH02012944"
#make a combined metadata for three plates
batch24_metadata <- batch24 %>%
map_dfr(~ {
.x$Annotation %>%
mutate(
Plate_ID = plate_barcode,
Well_ID = well_id,
Barcode = paste0(plate_barcode, "_", well_index),
Row = LETTERS[row],
Column = as.integer(col),
Species = "human",
Cell_type = "U2OS",
Model_type = "2D_adherent",
Time = as.factor(hours_post_treatment),
Unit = "h",
Treatment_1 = cmpd_sample_id,
Concentration_1 = as.numeric(concentration),
Unit_1 = unit,
Sample_type = if_else(well_type == "SA" & col == 24,
"Positive Control",
well_type)
)
})
batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
well_index, col, row, biosample_id, external_biosample_id,
cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
# create a combined UMI matrix for 3 plates
batch24_counts <- batch24 %>%
map(~ {
.x$UMI.counts %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
mutate(gene_name = make.unique(gene_name)) %>%
select(-chrom) %>%
tibble::column_to_rownames(var = "gene_name") %>%
as.matrix()
})
binded_counts <- do.call(cbind, batch24_counts)
as_mac <- CreateSeuratObject(counts = binded_counts,
min.cells = 1,
min.features = 1)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Data is of class matrix. Coercing to dgCMatrix.
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))
Filtering steps
Include only good DMSO wells as controls
as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)
badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>%
filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) |
(Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))|
(Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23", "O23","M23","K23"))))
keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))
mac_badDSMOremoved <- as_mac[,keep_wells]
mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
group_by = "combined_id", min_counts =10,
min_samples = min_sample_num)
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
p <- plot_plate_layout(mac_badDSMOremoved, "nCount_RNA", "combined_id") + facet_wrap(~orig.ident, ncol = 1) +
theme(strip.text = element_text(size=10),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=8),
legend.title = element_text(size=10),
legend.text = element_text(size=8),
trip.background = element_blank())
girafe(ggobj = p,
fonts = list(sans = "sans"),
options = list(
opts_hover(css = "stroke:black; stroke-width:1px;")
))
## Warning in plot_theme(plot): The `trip.background` theme element is not defined
## in the element hierarchy.
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
five_dmso_methods_res <- lapply(methods, function(m){
message("Processing method: ", m)
# m<-"limma_voom"
treatment_samples <- "FF_86_NH56_10"
control_samples <- "CB_43_EP73_0"
subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))
batch <- subset$orig.ident
if (m!="limma_trend"){
# subset <- filter_genes_by_expression(subset,
# group_by = "combined_id", min_counts = 10,
# min_samples = 1)
badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch)
} else{
badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
}
# plot(plot_volcano(top_table, max.overlaps = 5))
badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
length(intersect(badDMSO_out_degs, drugseq_deg))
badDMSO_out_top_table <- badDMSO_out_top_table %>%
arrange(p_value_adj, desc(log2FC)) %>%
mutate(gene = factor(gene, levels = unique(gene)))
# add a column if there are in the intersect(alldmso_degs, drugseq_deg)
badDMSO_out_top_table <- badDMSO_out_top_table %>%
mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
stat_ecdf(size = 1) +
scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
color = "In DRUGseq DEGs") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_classic()
ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
# ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
# geom_point(alpha = 0.6, size = 1.2) +
# geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
# geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
# scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
# labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
# theme_classic()
#
# label a few top overlapping genes
lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
dplyr::arrange(p_value_adj, dplyr::desc(log2FC))
volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
theme_classic()
ks <- c(25,50,100,200,500,1000)
prec_at_k <- sapply(ks, function(k)
mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
)
# plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
# ylab = "Precision@k (fraction in DRUGseq set)")
ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
geom_point() + geom_line() +
labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
theme_classic()
# alluvial plot
macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
#return
result_list <- list(top_table = badDMSO_out_top_table,
num_degs_macpie = length(badDMSO_out_degs),
n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
ecdf_plot = plt_ecdf,
ks_test_results = ks_test_results,
volcano_plot = volcano_overlap,
ks_plot = ks_plot,
alluvial_plot = alluvial_plot)
return(result_list)
})
## Processing method: limma_voom
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: DESeq2
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: edgeR
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: limma_trend
## Warning: Zero sample variances detected, have been offset away from zero
## Warning: p-value will be approximate in the presence of ties
names(five_dmso_methods_res) <- methods
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
deg_summary <- map_df(five_dmso_methods_res, function(x) {
data.frame(
num_degs_macpie = x$num_degs_macpie,
n_overlap = x$n_overlap,
num_degs_DRUGseq = length(drugseq_deg)
)
}, .id = paste0("macpie_methods"))
deg_summary
## macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
## 1 limma_voom 3456 592 1423
## 2 DESeq2 1107 549 1423
## 3 edgeR 2185 690 1423
## 4 limma_trend 5 2 1423
#append top_table from each method into a single data frame
good_dmso_top_table <- map_df(five_dmso_methods_res, "top_table", .id = "method")
head(good_dmso_top_table)
## method gene log2FC metric p_value p_value_adj
## 1 limma_voom AL031963.2 1.477162 34.73174 4.135992e-26 1.048226e-21
## 2 limma_voom SLC45A4 1.746060 25.83312 2.602120e-22 3.297406e-18
## 3 limma_voom Z97192.4 5.194480 22.09739 2.424226e-20 2.047986e-16
## 4 limma_voom AP003068.2 1.691204 26.07754 6.263377e-20 3.968475e-16
## 5 limma_voom AL583807.1 2.808626 22.32748 2.810497e-18 1.187154e-14
## 6 limma_voom AC007066.1 1.833326 23.05883 2.624228e-18 1.187154e-14
## in_drugseq_deg
## 1 no
## 2 yes
## 3 yes
## 4 no
## 5 no
## 6 yes
#if in_drugseq_deg is "no", change method to "DRUGseq"
good_dmso_top_table<- good_dmso_top_table %>%
mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))
#rename methods
good_dmso_top_table$method <- recode(good_dmso_top_table$method,
"limma_voom" = "macpie:limma_voom",
"DESeq2" = "macpie:DESeq2",
"edgeR" = "macpie:edgeR",
"limma_trend" = "macpie:limma_trend")
ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
stat_ecdf(size = 1) +
scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
color = "In DRUGseq DEGs") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_classic()
## Warning: Removed 12331 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
five_dmso_methods_res$limma_voom$volcano_plot
## Warning: ggrepel: 575 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
five_dmso_methods_res$DESeq2$volcano_plot
## Warning: Removed 12331 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: ggrepel: 536 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
five_dmso_methods_res$edgeR$volcano_plot
## Warning: ggrepel: 662 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
five_dmso_methods_res$limma_trend$volcano_plot
five_dmso_methods_res$limma_voom$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
five_dmso_methods_res$DESeq2$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
five_dmso_methods_res$edgeR$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
five_dmso_methods_res$limma_trend$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells.
We know these good DMSO wells for batch 24 from their published data:
VH02012942: I23, M23
VH02012944: D23, H23
VH02012956: I23, J23
batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
names(batch24)
## [1] "VH02012956" "VH02012942" "VH02012944"
#make a combined metadata for three plates
batch24_metadata <- batch24 %>%
map_dfr(~ {
.x$Annotation %>%
mutate(
Plate_ID = plate_barcode,
Well_ID = well_id,
Barcode = paste0(plate_barcode, "_", well_index),
Row = LETTERS[row],
Column = as.integer(col),
Species = "human",
Cell_type = "U2OS",
Model_type = "2D_adherent",
Time = as.factor(hours_post_treatment),
Unit = "h",
Treatment_1 = cmpd_sample_id,
Concentration_1 = as.numeric(concentration),
Unit_1 = unit,
Sample_type = if_else(well_type == "SA" & col == 24,
"Positive Control",
well_type)
)
})
batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
well_index, col, row, biosample_id, external_biosample_id,
cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
# create a combined UMI matrix for 3 plates
batch24_counts <- batch24 %>%
map(~ {
.x$UMI.counts %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
mutate(gene_name = make.unique(gene_name)) %>%
select(-chrom) %>%
tibble::column_to_rownames(var = "gene_name") %>%
as.matrix()
})
binded_counts <- do.call(cbind, batch24_counts)
as_mac <- CreateSeuratObject(counts = binded_counts,
min.cells = 1,
min.features = 1)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Data is of class matrix. Coercing to dgCMatrix.
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))
Filtering steps
Include only good DMSO wells as controls
as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)
badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>%
filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) |
(Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))|
(Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23"))))
keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))
mac_badDSMOremoved <- as_mac[,keep_wells]
mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
group_by = "combined_id", min_counts =10,
min_samples = min_sample_num)
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
good_dmso_methods_res <- lapply(methods, function(m){
message("Processing method: ", m)
# m<-"limma_voom"
treatment_samples <- "FF_86_NH56_10"
control_samples <- "CB_43_EP73_0"
subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))
batch <- subset$orig.ident
if (m!="limma_trend"){
# subset <- filter_genes_by_expression(subset,
# group_by = "combined_id", min_counts = 10,
# min_samples = 1)
badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch)
} else{
badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
}
# plot(plot_volcano(top_table, max.overlaps = 5))
badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
length(intersect(badDMSO_out_degs, drugseq_deg))
badDMSO_out_top_table <- badDMSO_out_top_table %>%
arrange(p_value_adj, desc(log2FC)) %>%
mutate(gene = factor(gene, levels = unique(gene)))
# add a column if there are in the intersect(alldmso_degs, drugseq_deg)
badDMSO_out_top_table <- badDMSO_out_top_table %>%
mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
stat_ecdf(size = 1) +
scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
color = "In DRUGseq DEGs") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_classic()
ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
# ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
# geom_point(alpha = 0.6, size = 1.2) +
# geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
# geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
# scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
# labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
# theme_classic()
#
# label a few top overlapping genes
lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
dplyr::arrange(p_value_adj, dplyr::desc(log2FC))
volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
theme_classic()
ks <- c(25,50,100,200,500,1000)
prec_at_k <- sapply(ks, function(k)
mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
)
# plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
# ylab = "Precision@k (fraction in DRUGseq set)")
ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
geom_point() + geom_line() +
labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
theme_classic()
# alluvial plot
macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
#return
result_list <- list(top_table = badDMSO_out_top_table,
num_degs_macpie = length(badDMSO_out_degs),
n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
ecdf_plot = plt_ecdf,
ks_test_results = ks_test_results,
volcano_plot = volcano_overlap,
ks_plot = ks_plot,
alluvial_plot = alluvial_plot)
return(result_list)
})
## Processing method: limma_voom
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: DESeq2
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: edgeR
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: limma_trend
## Warning: Zero sample variances detected, have been offset away from zero
## Warning: p-value will be approximate in the presence of ties
names(good_dmso_methods_res) <- methods
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
deg_summary <- map_df(good_dmso_methods_res, function(x) {
data.frame(
num_degs_macpie = x$num_degs_macpie,
n_overlap = x$n_overlap,
num_degs_DRUGseq = length(drugseq_deg)
)
}, .id = paste0("macpie_methods"))
deg_summary
## macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
## 1 limma_voom 1352 204 1423
## 2 DESeq2 10 8 1423
## 3 edgeR 1757 604 1423
## 4 limma_trend 2 1 1423
#append top_table from each method into a single data frame
good_dmso_top_table <- map_df(good_dmso_methods_res, "top_table", .id = "method")
head(good_dmso_top_table)
## method gene log2FC metric p_value p_value_adj
## 1 limma_voom AL031963.2 1.450009 52.04866 0.000000e+00 0.000000e+00
## 2 limma_voom RNU6-633P 1.254947 72.59594 0.000000e+00 0.000000e+00
## 3 limma_voom ELOCP10 1.269453 35.96512 2.086923e-279 1.762685e-275
## 4 limma_voom AP003068.2 1.677015 35.65408 1.072200e-274 6.792120e-271
## 5 limma_voom AC007066.1 1.821885 32.63913 4.558729e-220 2.310273e-216
## 6 limma_voom SLC45A4 1.745313 31.67705 6.854635e-218 2.894826e-214
## in_drugseq_deg
## 1 no
## 2 no
## 3 no
## 4 no
## 5 yes
## 6 yes
#if in_drugseq_deg is "no", change method to "DRUGseq"
good_dmso_top_table<- good_dmso_top_table %>%
mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))
#rename methods
good_dmso_top_table$method <- recode(good_dmso_top_table$method,
"limma_voom" = "macpie:limma_voom",
"DESeq2" = "macpie:DESeq2",
"edgeR" = "macpie:edgeR",
"limma_trend" = "macpie:limma_trend")
ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
stat_ecdf(size = 1) +
scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
color = "In DRUGseq DEGs") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme_classic()
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
## log-10 transformation introduced infinite values.
## Warning: Removed 19707 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
good_dmso_methods_res$limma_voom$ecdf_plot
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
## log-10 transformation introduced infinite values.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
good_dmso_methods_res$DESeq2$ecdf_plot
## Warning: Removed 19705 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
good_dmso_methods_res$edgeR$ecdf_plot
good_dmso_methods_res$limma_trend$ecdf_plot
### Overlapping volcano plot
good_dmso_methods_res$limma_voom$volcano_plot
## Warning: ggrepel: 196 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
good_dmso_methods_res$DESeq2$volcano_plot
## Warning: Removed 19705 rows containing missing values or values outside the scale range
## (`geom_point()`).
good_dmso_methods_res$edgeR$volcano_plot
## Warning: ggrepel: 583 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
good_dmso_methods_res$limma_trend$volcano_plot
good_dmso_methods_res$limma_voom$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
good_dmso_methods_res$DESeq2$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
good_dmso_methods_res$edgeR$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
good_dmso_methods_res$limma_trend$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
To compare DEGs with different replicate numbers and different methods
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
get_jaccard <- function(deg_set, drugseq_deg){
intersection <- length(intersect(deg_set, drugseq_deg))
union <- length(union(deg_set, drugseq_deg))
jaccard_index <- intersection / union
return(jaccard_index)
}
jaccard_index <- lapply(methods, function(m){
# all dmso
degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
jaccard_all <- get_jaccard(degs, drugseq_deg)
# five dmso
degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
jaccard_five <- get_jaccard(degs, drugseq_deg)
# three dmso
degs <- good_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
jaccard_three <- get_jaccard(degs, drugseq_deg)
jaccard_index <- data.frame(
method = m,
jaccard_all = jaccard_all,
jaccard_five = jaccard_five,
jaccard_three = jaccard_three
)
return(jaccard_index)
})
df <- as.data.frame(do.call(rbind, jaccard_index))
rownames(df) <- df$method
df <- df %>% select(-method)
colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO")
pheatmap::pheatmap(df,
cluster_rows = FALSE,
cluster_cols = FALSE,
display_numbers = TRUE,
main = "Jaccard Index between macpie DEGs and DRUGseq DEGs")
library(UpSetR)
all_dmso <- list(
limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
DRUGseq = drugseq_deg
)
upset(fromList(all_dmso),
nsets = 5,
order.by = "freq",
main.bar.color = "black",
sets.bar.color = "gray23",
text.scale = c(2, 2, 2, 1.5, 2, 1.5))
five_dmso <- list(
limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
DRUGseq = drugseq_deg
)
upset(fromList(five_dmso),
nsets = 5,
order.by = "freq",
main.bar.color = "black",
sets.bar.color = "gray23",
text.scale = c(2, 2, 2, 1.5, 2, 1.5))
good_dmso <- list(
limma_voom = good_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
DESeq2 = good_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
edgeR = good_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
limma_trend = good_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
DRUGseq = drugseq_deg
)
upset(fromList(good_dmso),
nsets = 5,
order.by = "freq",
main.bar.color = "black",
sets.bar.color = "gray23",
text.scale = c(2, 2, 2, 1.5, 2, 1.5))